source("./load_files.R")

# Locate every Thais CSV file anywhere below the working directory
# (case-insensitive match on the file name).
data_files_selected <-
  list.files(
    pattern = ".*thais_.*csv$",
    recursive = TRUE,
    ignore.case = TRUE
  )

# File sizes in bytes, named by path.  vapply is type-stable: it always
# returns a numeric vector, whereas sapply() would return a list when no
# files match.
data_files_and_size <-
  vapply(data_files_selected, file.size, numeric(1))

# Table of file names and sizes (bytes -> MB) for reporting.
files_to_include_in_dataframe <-
  tibble(
    "Files" = names(data_files_and_size),
    "Size (in MB)" = data_files_and_size / 1E6
  )
# Quick structural summary of the selected files.  The skimr output is
# reproduced below as comments; in the raw extraction these table lines
# had lost their comment prefix and would not parse as R.
skimr::skim(files_to_include_in_dataframe)
## | Name | files_to_include_in_dataf… |
## | Number of rows | 24 |
## | Number of columns | 2 |
## | _______________________ | |
## | Column type frequency: | |
## | character | 1 |
## | numeric | 1 |
## | ________________________ | |
## | Group variables | None |
## Variable type: character
## | skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
## |---|---|---|---|---|---|---|---|
## | Files | 0 | 1 | 128 | 130 | 0 | 24 | 0 |
## Variable type: numeric
## | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
## |---|---|---|---|---|---|---|---|---|---|---|
## | Size (in MB) | 0 | 1 | 2.18 | 3.48 | 0.32 | 0.38 | 0.6 | 2.1 | 10.99 | ▇▁▁▁▁ |
# Load all CDR records from the selected files and annotate each row with
# metadata parsed out of its source-file name: experiment group, cycle
# label, and time point.  Unmatched names fall through to "unknown".
cdr <- load_cdr(names(data_files_and_size))

cdr <- cdr %>%
  mutate(
    # Experiment group encoded in the file name.
    expgroup = case_when(
      str_detect(file, "thaisnovoheader") ~ "nh",
      str_detect(file, "thais_29") ~ "29",
      str_detect(file, "thais_66") ~ "66",
      TRUE ~ "unknown"
    ),
    # Cycle-comparison label encoded in the file name.
    cycle = case_when(
      str_detect(file, "R0_R2") ~ "R0_R2",
      str_detect(file, "R0_R3") ~ "R0_R3",
      str_detect(file, "R0_R4") ~ "R0_R4",
      str_detect(file, "R2_R3") ~ "R2_R3",
      str_detect(file, "R2_R4") ~ "R2_R4",
      str_detect(file, "R3_R4") ~ "R3_R4",
      TRUE ~ "unknown"
    ),
    # Time point encoded in the file name.
    time = case_when(
      str_detect(file, "Initial") ~ "initial",
      str_detect(file, "Final") ~ "final",
      TRUE ~ "unknown"
    )
  ) %>%
  # Put the key/label columns first for readability.
  select(cdr3, cycle, time, expgroup, everything())
# Per-CDR3 fold changes between time points within each
# (cdr3, expgroup, cycle) group.
# NOTE(review): with arrange(desc(time)) on a character column, "initial"
# sorts before "final", so lag() on the "final" row divides final by
# initial — confirm that orientation is intended.
cdr %<>%
  group_by(cdr3, expgroup, cycle) %>%
  arrange(desc(time), .by_group = TRUE) %>%
  mutate(
    # default = first(...) makes the first row of each group a fold
    # change of 1 instead of NA.
    fcp = cdrp / lag(cdrp, default = first(cdrp)),
    fcq = quantity / lag(quantity, default = first(quantity))
  ) %>%
  select(cdr3:quantity, fcp, fcq, everything())

# Violin + jitter of log10 fold change for the top 10% per group at the
# final time point, faceted by cycle; crossbar marks the group mean.
cdr %>%
  filter(time == "final") %>%
  # filter(str_detect(cycle, "R0")) %>%
  group_by(expgroup, cycle, time) %>%
  arrange(desc(fcp)) %>%
  slice_head(prop = .1) %>%
  # slice_head(n = 1000) %>%
  ggplot(aes(expgroup, log10(fcp))) +
  geom_violin(aes(fill = expgroup, color = expgroup), alpha = 0.5) +
  geom_jitter(aes(shape = expgroup), alpha = 0.6, size = 1) +
  stat_summary(
    fun = mean,
    fun.min = mean,
    fun.max = mean,
    geom = "crossbar",
    # width = 0.5,
    aes(color = expgroup)
  ) +
  facet_grid(. ~ cycle)

# cdr %>%
#   filter(str_detect(cycle, "R0")) %>%
#   filter(time == "final") %>%
#   group_by(expgroup, cycle, time) %>%
#   arrange(desc(fcp)) %>%
#   slice_head(n = 1000) %>%
#   ggplot(aes(fcp, color = expgroup, fill = expgroup)) +
#   geom_density(stat = "bin", alpha = 0.3) +
#   facet_grid(cycle ~ expgroup)
# Classify every CDR3 as "rich" / "medium" / "poor".
# Within each (expgroup, cycle, time) group: take the top 10% of rows by
# fcp, use the group mean of log10(fcp) as a threshold, and mark rows at
# or above it (only at the final time point) as "rich"; the rest of the
# top 10% become "medium".  Joining back onto the full table leaves all
# remaining rows with rich = NA, which is then recoded as "poor", and
# their missing threshold is set to 0.
# NOTE(review): full_join() is called without an explicit `by`, so it
# joins on all shared columns — confirm that natural join is intended.
cdr %<>%
group_by(expgroup, cycle, time) %>%
arrange(desc(fcp)) %>%
slice_head(prop = .1) %>%
mutate(
threshold = mean(log10(fcp))
) %>%
mutate(
rich = if_else(
(log10(fcp) >= threshold) &
# (time == "final") &
# (str_detect(cycle, "R0")),
(time == "final"),
"rich",
"medium")) %>%
full_join(cdr) %>%
mutate(
rich = if_else(
is.na(rich),
"poor",
rich)) %>%
# Ordered factor-ish levels so plots facet in a sensible order.
mutate(
rich = factor(rich,
levels = c("rich", "medium", "poor"))
) %>%
mutate(
threshold = if_else(
is.na(threshold),
0,
threshold
)
)
# Fold-change distribution of the "rich" class, per cycle.
cdr %>%
  filter(rich == "rich") %>%
  ggplot() +
  geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
  geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
  facet_grid(rich ~ cycle)

# Same view for the non-"rich" classes ("medium" and "poor").
cdr %>%
  filter(rich != "rich") %>%
  ggplot() +
  geom_violin(aes(expgroup, log10(fcp), fill = expgroup)) +
  geom_jitter(aes(expgroup, log10(fcp), shape = expgroup), alpha = .2) +
  facet_grid(rich ~ cycle)

names(cdr)
## [1] "cdr3" "cycle" "time" "expgroup" "cdrp" "quantity"
## [7] "fcp" "fcq" "length" "MW" "AV" "IP"
## [13] "flex" "gravy" "SSF_Helix" "SSF_Turn" "SSF_Sheet" "n_A"
## [19] "n_C" "n_D" "n_E" "n_F" "n_G" "n_H"
## [25] "n_I" "n_K" "n_L" "n_M" "n_N" "n_P"
## [31] "n_Q" "n_R" "n_S" "n_T" "n_V" "n_W"
## [37] "n_Y" "aliphatic" "aromatic" "neutral" "positive" "negative"
## [43] "invalid" "file" "threshold" "rich"
# "Rich" CDR3s of experiment 29, cycle R0_R4, with their physicochemical
# properties (the grouping/label columns are dropped from the view).
cdr %>%
  filter(rich == "rich") %>%
  filter(expgroup == "29" & cycle == "R0_R4") %>%
  ungroup() %>%
  select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup"))
## # A tibble: 13 x 14
## cdr3 cdrp quantity fcp fcq length MW AV IP flex gravy
## <chr> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 GPPH… 1.65e-1 23551 1.42e4 2.36e4 13 1590. 0.231 5.97 0.735 -1.26
## 2 DLHN… 1.32e-1 18816 1.14e4 1.88e4 9 1132. 0.222 4.20 0.739 -0.967
## 3 RYRN… 5.64e-2 8067 4.87e3 8.07e3 19 2410. 0.368 5.96 0.744 -1.71
## 4 AYPE… 1.47e-1 21041 2.12e3 3.51e3 10 1212. 0.3 4.05 0.714 -0.2
## 5 ITTV… 3.66e-3 524 3.16e2 5.24e2 14 1691. 0.286 5.83 0.702 -0.0857
## 6 RGSS… 1.66e-3 237 1.43e2 2.37e2 9 1005. 0.222 5.84 0.782 -1.12
## 7 DY 3.36e-4 48 5.80e0 9.60e0 2 296. 0.5 4.05 0.728 -2.4
## 8 ETWG… 5.59e-5 8 4.83e0 8.00e0 7 881. 0.286 4.05 0.774 -1.7
## 9 DV 3.22e-4 46 2.78e0 4.60e0 2 232. 0 4.05 0.746 0.35
## 10 QQIA… 2.80e-5 4 2.41e0 4.00e0 7 850. 0.143 4.05 0.723 -0.243
## 11 GTSS… 2.10e-5 3 1.81e0 3.00e0 11 1213. 0.182 4.05 0.742 -0.191
## 12 GVMG… 2.10e-5 3 1.81e0 3.00e0 10 1215. 0.4 4.05 0.678 0
## 13 GYSS… 2.10e-5 3 1.81e0 3.00e0 9 1049. 0.333 4.05 0.759 -1.43
## # … with 3 more variables: SSF_Helix <dbl>, SSF_Turn <dbl>, SSF_Sheet <dbl>
# Molecular-weight density of the "poor" CDR3s (experiment 66, R0_R4).
cdr %>%
  filter(rich == "poor") %>%
  filter(expgroup == "66" & cycle == "R0_R4") %>%
  ungroup() %>%
  select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) %>%
  ggplot() +
  geom_density(aes(MW))

# The "rich" counterpart, kept in `rich66` for inspection.
cdr %>%
  filter(rich == "rich") %>%
  filter(expgroup == "66" & cycle == "R0_R4") %>%
  ungroup() %>%
  select(cdr3:SSF_Sheet & !c("cycle", "time", "expgroup")) -> rich66

rich66
## # A tibble: 363 x 14
## cdr3 cdrp quantity fcp fcq length MW AV IP flex gravy
## <chr> <dbl> <int> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SQDV… 0.0138 2794 860. 1397 13 1440. 0.0769 4.05 0.733 0.0769
## 2 TAGV… 0.00433 876 539. 876 8 886. 0.25 4.05 0.718 -0.075
## 3 GRPE… 0.00294 595 366. 595 10 1128. 0.1 5.32 0.764 -1.06
## 4 GDLV… 0.00240 486 299. 486 13 1588. 0.308 4.21 0.732 -0.477
## 5 ARPK… 0.00233 471 290. 471 14 1461. 0.0714 6.00 0.794 -1.26
## 6 GRWI… 0.00206 417 257. 417 8 992. 0.25 8.75 0.711 -0.338
## 7 DLDW… 0.00175 355 218. 355 9 1199. 0.333 4.05 0.705 0.244
## 8 SGAV… 0.00173 351 216. 351 13 1414. 0.154 4.37 0.721 0.331
## 9 TRKQ… 0.00163 330 203. 330 8 957. 0 11.0 0.801 -1.84
## 10 DGRY… 0.00153 310 191. 310 8 1063. 0.375 4.21 0.741 -1.9
## # … with 353 more rows, and 3 more variables: SSF_Helix <dbl>, SSF_Turn <dbl>,
## # SSF_Sheet <dbl>
# Column inventory of the annotated table (output kept as comments).
names(cdr)
## [1] "cdr3" "cycle" "time" "expgroup" "cdrp" "quantity"
## [7] "fcp" "fcq" "length" "MW" "AV" "IP"
## [13] "flex" "gravy" "SSF_Helix" "SSF_Turn" "SSF_Sheet" "n_A"
## [19] "n_C" "n_D" "n_E" "n_F" "n_G" "n_H"
## [25] "n_I" "n_K" "n_L" "n_M" "n_N" "n_P"
## [31] "n_Q" "n_R" "n_S" "n_T" "n_V" "n_W"
## [37] "n_Y" "aliphatic" "aromatic" "neutral" "positive" "negative"
## [43] "invalid" "file" "threshold" "rich"
# GRAVY distribution by richness class (experiment 66, cycle R0_R4).
cdr %>%
  filter(expgroup == "66" & cycle == "R0_R4") %>%
  ggplot() +
  geom_violin(aes(rich, gravy, color = rich)) +
  geom_jitter(aes(rich, gravy), alpha = .1)

library(Rtsne)

# t-SNE input: final time point, cycle R0_R4, experiment 29.
set.seed(42)
tsne_df <- cdr %>%
  # filter(!rich == "poor") %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "29")

# Sanity check: list the columns that contain any NA
# (sum(is.na(.)) > 0 collapses through all() to a single logical).
tsne_df %>%
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 29 R0_R4

# Confirm no row is missing its threshold.
tsne_df %>%
  filter(is.na(threshold)) %>%
  select(rich, threshold, quantity)
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>

# 3-D t-SNE on the non-character columns, with zero-variance columns
# removed and duplicate rows collapsed.  Large perplexity (420) relative
# to the ~1.5k rows — see the flat early error in the log below.
set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique() %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 420,
    theta = 0.1,
    max_iter = 2E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    num_threads = parallel::detectCores() - 2
  )
## Performing PCA
## Read the 1544 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 420.000000, and theta = 0.100000
## Computing input similarities...
## Building tree...
## Done in 4.72 seconds (sparsity = 0.932734)!
## Learning embedding...
## Iteration 50: error is 44.145871 (50 iterations in 7.22 seconds)
## Iteration 100: error is 44.145871 (50 iterations in 8.36 seconds)
## Iteration 150: error is 44.145871 (50 iterations in 9.83 seconds)
## Iteration 200: error is 44.145871 (50 iterations in 11.66 seconds)
## Iteration 250: error is 44.145871 (50 iterations in 14.65 seconds)
## Iteration 300: error is 1.193916 (50 iterations in 12.47 seconds)
## Iteration 350: error is 0.319184 (50 iterations in 6.44 seconds)
## Iteration 400: error is 0.241505 (50 iterations in 5.90 seconds)
## Iteration 450: error is 0.229975 (50 iterations in 6.36 seconds)
## Iteration 500: error is 0.226926 (50 iterations in 6.31 seconds)
## Iteration 550: error is 0.226181 (50 iterations in 6.22 seconds)
## Iteration 600: error is 0.225912 (50 iterations in 6.29 seconds)
## Iteration 650: error is 0.225838 (50 iterations in 6.83 seconds)
## Iteration 700: error is 0.225819 (50 iterations in 6.45 seconds)
## Iteration 750: error is 0.225798 (50 iterations in 6.39 seconds)
## Iteration 800: error is 0.225781 (50 iterations in 6.56 seconds)
## Iteration 850: error is 0.225778 (50 iterations in 6.94 seconds)
## Iteration 900: error is 0.225771 (50 iterations in 6.69 seconds)
## Iteration 950: error is 0.225767 (50 iterations in 6.80 seconds)
## Iteration 1000: error is 0.225760 (50 iterations in 6.82 seconds)
## Iteration 1050: error is 0.225743 (50 iterations in 6.83 seconds)
## Iteration 1100: error is 0.225722 (50 iterations in 6.36 seconds)
## Iteration 1150: error is 0.225723 (50 iterations in 6.43 seconds)
## Iteration 1200: error is 0.225719 (50 iterations in 6.22 seconds)
## Iteration 1250: error is 0.225719 (50 iterations in 6.67 seconds)
## Iteration 1300: error is 0.225725 (50 iterations in 6.38 seconds)
## Iteration 1350: error is 0.225728 (50 iterations in 6.39 seconds)
## Iteration 1400: error is 0.225729 (50 iterations in 6.37 seconds)
## Iteration 1450: error is 0.225728 (50 iterations in 6.50 seconds)
## Iteration 1500: error is 0.225730 (50 iterations in 6.48 seconds)
## Iteration 1550: error is 0.225729 (50 iterations in 6.37 seconds)
## Iteration 1600: error is 0.225730 (50 iterations in 6.39 seconds)
## Iteration 1650: error is 0.225734 (50 iterations in 6.81 seconds)
## Iteration 1700: error is 0.225733 (50 iterations in 6.59 seconds)
## Iteration 1750: error is 0.225732 (50 iterations in 6.42 seconds)
## Iteration 1800: error is 0.225733 (50 iterations in 6.30 seconds)
## Iteration 1850: error is 0.225730 (50 iterations in 6.48 seconds)
## Iteration 1900: error is 0.225729 (50 iterations in 6.46 seconds)
## Iteration 1950: error is 0.225730 (50 iterations in 6.30 seconds)
## Iteration 2000: error is 0.225735 (50 iterations in 6.35 seconds)
## Fitting performed in 284.28 seconds.
# Rebuild the exact feature matrix fed to Rtsne so its rows align with
# the embedding rows (needed to color points by `rich`).
tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique() -> a

# 2-D projection of the 3-D embedding.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  ggplot() +
  geom_point(aes(V1, V2, color = a$rich))

# Interactive 3-D view (this plot appears twice in the original
# document; both renderings are preserved).
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  plot_ly(
    title = "Sample title",
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 29")

tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  plot_ly(
    title = "Sample title",
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 29")

library(Rtsne)
# t-SNE input: final time point, cycle R0_R4, experiment 66.
set.seed(42)
tsne_df <- cdr %>%
  # filter(!rich == "poor") %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "66")

# Sanity check: columns containing any NA.
tsne_df %>%
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 66 R0_R4

# Confirm no row is missing its threshold.
tsne_df %>%
  filter(is.na(threshold)) %>%
  select(rich, threshold, quantity)
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>

# 3-D t-SNE with near-default settings (perplexity 30, theta 0.5).
set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique() %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 30,
    theta = 0.5,
    max_iter = 1E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    num_threads = parallel::detectCores() - 2
  )
## Performing PCA
## Read the 8436 x 42 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 7.15 seconds (sparsity = 0.015493)!
## Learning embedding...
## Iteration 50: error is 95.444063 (50 iterations in 18.25 seconds)
## Iteration 100: error is 92.761432 (50 iterations in 24.37 seconds)
## Iteration 150: error is 91.017719 (50 iterations in 12.25 seconds)
## Iteration 200: error is 90.890341 (50 iterations in 10.68 seconds)
## Iteration 250: error is 90.831025 (50 iterations in 10.93 seconds)
## Iteration 300: error is 2.973973 (50 iterations in 8.21 seconds)
## Iteration 350: error is 2.540478 (50 iterations in 7.78 seconds)
## Iteration 400: error is 2.310407 (50 iterations in 8.07 seconds)
## Iteration 450: error is 2.166549 (50 iterations in 7.52 seconds)
## Iteration 500: error is 2.067423 (50 iterations in 7.54 seconds)
## Iteration 550: error is 1.995633 (50 iterations in 7.60 seconds)
## Iteration 600: error is 1.942256 (50 iterations in 8.12 seconds)
## Iteration 650: error is 1.902282 (50 iterations in 8.24 seconds)
## Iteration 700: error is 1.871328 (50 iterations in 8.12 seconds)
## Iteration 750: error is 1.847140 (50 iterations in 8.41 seconds)
## Iteration 800: error is 1.828978 (50 iterations in 8.32 seconds)
## Iteration 850: error is 1.815785 (50 iterations in 8.41 seconds)
## Iteration 900: error is 1.805099 (50 iterations in 8.40 seconds)
## Iteration 950: error is 1.796396 (50 iterations in 8.66 seconds)
## Iteration 1000: error is 1.788672 (50 iterations in 8.51 seconds)
## Fitting performed in 198.40 seconds.
# Feature matrix aligned row-for-row with the embedding (for coloring).
tsne_df %>%
  ungroup() %>%
  select(!where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  unique() -> a

# 2-D projection of the embedding, colored by richness class.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  ggplot() +
  geom_point(aes(V1, V2, color = a$rich))

# Interactive 3-D view.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 66")

library(Rtsne)
# t-SNE on composition/property features only (experiment 29, R0_R4):
# the abundance-derived columns (quantity, cdrp, fcp, fcq) and the
# class labels (rich, threshold) are excluded so the embedding cannot
# trivially separate on the outcome.
set.seed(42)
tsne_df <- cdr %>%
  # filter(!rich == "poor") %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "29")

# Sanity check: columns containing any NA.
tsne_df %>%
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 29 R0_R4

# Confirm no row is missing its threshold.
tsne_df %>%
  filter(is.na(threshold)) %>%
  select(rich, threshold, quantity)
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>

set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  select(cdr3, !where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>%
  unique() %>%
  # NOTE(review): cdr3 is passed to Rtsne as a factor column — confirm
  # the resulting numeric coercion is intended as a feature.
  mutate(cdr3 = as.factor(cdr3)) %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 30,
    theta = 0.5,
    max_iter = 2E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    partial_pca = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    num_threads = parallel::detectCores() - 2
  )
## Performing PCA
## Read the 1589 x 50 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.48 seconds (sparsity = 0.079423)!
## Learning embedding...
## Iteration 50: error is 73.611075 (50 iterations in 2.26 seconds)
## Iteration 100: error is 69.273657 (50 iterations in 1.37 seconds)
## Iteration 150: error is 68.914566 (50 iterations in 1.44 seconds)
## Iteration 200: error is 68.813606 (50 iterations in 1.50 seconds)
## Iteration 250: error is 68.765041 (50 iterations in 1.61 seconds)
## Iteration 300: error is 1.384633 (50 iterations in 1.31 seconds)
## Iteration 350: error is 1.190552 (50 iterations in 1.25 seconds)
## Iteration 400: error is 1.123723 (50 iterations in 1.29 seconds)
## Iteration 450: error is 1.099565 (50 iterations in 1.31 seconds)
## Iteration 500: error is 1.083687 (50 iterations in 1.37 seconds)
## Iteration 550: error is 1.075679 (50 iterations in 1.38 seconds)
## Iteration 600: error is 1.066637 (50 iterations in 1.38 seconds)
## Iteration 650: error is 1.056727 (50 iterations in 1.37 seconds)
## Iteration 700: error is 1.049040 (50 iterations in 1.44 seconds)
## Iteration 750: error is 1.042673 (50 iterations in 1.52 seconds)
## Iteration 800: error is 1.037077 (50 iterations in 1.27 seconds)
## Iteration 850: error is 1.033224 (50 iterations in 1.29 seconds)
## Iteration 900: error is 1.029217 (50 iterations in 1.27 seconds)
## Iteration 950: error is 1.026656 (50 iterations in 1.30 seconds)
## Iteration 1000: error is 1.024118 (50 iterations in 1.53 seconds)
## Iteration 1050: error is 1.021757 (50 iterations in 1.41 seconds)
## Iteration 1100: error is 1.020240 (50 iterations in 1.43 seconds)
## Iteration 1150: error is 1.018609 (50 iterations in 1.38 seconds)
## Iteration 1200: error is 1.016400 (50 iterations in 1.38 seconds)
## Iteration 1250: error is 1.015295 (50 iterations in 1.38 seconds)
## Iteration 1300: error is 1.013907 (50 iterations in 1.39 seconds)
## Iteration 1350: error is 1.012659 (50 iterations in 1.37 seconds)
## Iteration 1400: error is 1.011199 (50 iterations in 1.39 seconds)
## Iteration 1450: error is 1.009526 (50 iterations in 1.39 seconds)
## Iteration 1500: error is 1.008536 (50 iterations in 1.41 seconds)
## Iteration 1550: error is 1.007399 (50 iterations in 1.42 seconds)
## Iteration 1600: error is 1.005997 (50 iterations in 1.43 seconds)
## Iteration 1650: error is 1.004614 (50 iterations in 1.42 seconds)
## Iteration 1700: error is 1.003587 (50 iterations in 1.66 seconds)
## Iteration 1750: error is 1.002543 (50 iterations in 1.35 seconds)
## Iteration 1800: error is 1.001491 (50 iterations in 1.31 seconds)
## Iteration 1850: error is 1.000320 (50 iterations in 1.36 seconds)
## Iteration 1900: error is 0.999214 (50 iterations in 1.43 seconds)
## Iteration 1950: error is 0.998373 (50 iterations in 1.29 seconds)
## Iteration 2000: error is 0.997141 (50 iterations in 1.29 seconds)
## Fitting performed in 56.36 seconds.
# Matching feature table; `rich` is retained here (unlike the t-SNE
# input) purely for coloring the embedding.
tsne_df %>%
  ungroup() %>%
  select(cdr3, !where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) %>%
  unique() -> a

# 2-D projection of the embedding, colored by richness class.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  ggplot() +
  geom_point(aes(V1, V2, color = a$rich))

# Interactive 3-D view.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 29")

library(Rtsne)
# t-SNE on composition/property features only (experiment 66, R0_R4);
# same setup as the experiment-29 run above.
set.seed(42)
tsne_df <- cdr %>%
  # filter(!rich == "poor") %>%
  filter(time == "final") %>%
  filter(cycle == "R0_R4") %>%
  filter(expgroup == "66")

# Sanity check: columns containing any NA.
tsne_df %>%
  summarise(across(everything(), ~ all(sum(is.na(.x))))) %>%
  select(where(isTRUE))
## # A tibble: 1 x 2
## # Groups: expgroup, cycle [1]
## expgroup cycle
## <chr> <chr>
## 1 66 R0_R4

# Confirm no row is missing its threshold.
tsne_df %>%
  filter(is.na(threshold)) %>%
  select(rich, threshold, quantity)
## # A tibble: 0 x 6
## # Groups: expgroup, cycle, time [0]
## # … with 6 variables: expgroup <chr>, cycle <chr>, time <chr>, rich <fct>,
## # threshold <dbl>, quantity <int>

set.seed(42)
tsne_out <-
  tsne_df %>%
  ungroup() %>%
  select(cdr3, !where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  select(!c("quantity", "cdrp", "fcp", "fcq", "rich", "threshold")) %>%
  unique() %>%
  # NOTE(review): cdr3 is passed to Rtsne as a factor column — confirm
  # the resulting numeric coercion is intended as a feature.
  mutate(cdr3 = as.factor(cdr3)) %>%
  Rtsne(
    X = .,
    dims = 3,
    perplexity = 30,
    theta = 0.5,
    max_iter = 2E3,
    verbose = TRUE,
    pca_center = TRUE,
    pca_scale = TRUE,
    normalize = TRUE,
    partial_pca = TRUE,
    eta = 200.0,
    exaggeration_factor = 12.0,
    num_threads = parallel::detectCores() - 2
  )
## Performing PCA
## Read the 8552 x 50 data matrix successfully!
## OpenMP is working. 6 threads.
## Using no_dims = 3, perplexity = 30.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 8.64 seconds (sparsity = 0.016355)!
## Learning embedding...
## Iteration 50: error is 94.546345 (50 iterations in 20.90 seconds)
## Iteration 100: error is 94.053600 (50 iterations in 27.91 seconds)
## Iteration 150: error is 94.007204 (50 iterations in 13.49 seconds)
## Iteration 200: error is 94.001788 (50 iterations in 17.41 seconds)
## Iteration 250: error is 94.000470 (50 iterations in 13.39 seconds)
## Iteration 300: error is 3.809958 (50 iterations in 15.10 seconds)
## Iteration 350: error is 3.245190 (50 iterations in 9.99 seconds)
## Iteration 400: error is 2.998464 (50 iterations in 9.70 seconds)
## Iteration 450: error is 2.848536 (50 iterations in 9.44 seconds)
## Iteration 500: error is 2.747522 (50 iterations in 9.80 seconds)
## Iteration 550: error is 2.675168 (50 iterations in 9.69 seconds)
## Iteration 600: error is 2.620772 (50 iterations in 9.78 seconds)
## Iteration 650: error is 2.577785 (50 iterations in 9.89 seconds)
## Iteration 700: error is 2.543844 (50 iterations in 9.52 seconds)
## Iteration 750: error is 2.516923 (50 iterations in 9.30 seconds)
## Iteration 800: error is 2.495979 (50 iterations in 9.80 seconds)
## Iteration 850: error is 2.479166 (50 iterations in 9.81 seconds)
## Iteration 900: error is 2.465158 (50 iterations in 9.81 seconds)
## Iteration 950: error is 2.453984 (50 iterations in 10.02 seconds)
## Iteration 1000: error is 2.444859 (50 iterations in 9.56 seconds)
## Iteration 1050: error is 2.436664 (50 iterations in 9.81 seconds)
## Iteration 1100: error is 2.429402 (50 iterations in 10.33 seconds)
## Iteration 1150: error is 2.423092 (50 iterations in 10.04 seconds)
## Iteration 1200: error is 2.417693 (50 iterations in 10.41 seconds)
## Iteration 1250: error is 2.412627 (50 iterations in 10.62 seconds)
## Iteration 1300: error is 2.408001 (50 iterations in 10.42 seconds)
## Iteration 1350: error is 2.403214 (50 iterations in 11.04 seconds)
## Iteration 1400: error is 2.399291 (50 iterations in 10.63 seconds)
## Iteration 1450: error is 2.395647 (50 iterations in 10.78 seconds)
## Iteration 1500: error is 2.392276 (50 iterations in 11.01 seconds)
## Iteration 1550: error is 2.388986 (50 iterations in 10.96 seconds)
## Iteration 1600: error is 2.386141 (50 iterations in 11.30 seconds)
## Iteration 1650: error is 2.383364 (50 iterations in 11.35 seconds)
## Iteration 1700: error is 2.380737 (50 iterations in 11.50 seconds)
## Iteration 1750: error is 2.377941 (50 iterations in 11.48 seconds)
## Iteration 1800: error is 2.375171 (50 iterations in 11.11 seconds)
## Iteration 1850: error is 2.372426 (50 iterations in 11.43 seconds)
## Iteration 1900: error is 2.369839 (50 iterations in 10.95 seconds)
## Iteration 1950: error is 2.367222 (50 iterations in 10.89 seconds)
## Iteration 2000: error is 2.364811 (50 iterations in 10.61 seconds)
## Fitting performed in 460.99 seconds.
# Matching feature table; `rich` retained for coloring only.
tsne_df %>%
  ungroup() %>%
  select(cdr3, !where(is.character)) %>%
  select(!c(which(apply(., 2, var) == 0))) %>%
  select(!c("quantity", "cdrp", "fcp", "fcq", "threshold")) %>%
  unique() -> a

# 2-D projection of the embedding, colored by richness class.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  ggplot() +
  geom_point(aes(V1, V2, color = a$rich))

# Interactive 3-D view.
tsne_out %>%
  .$Y %>%
  as_tibble() %>%
  plot_ly(
    x = .$V1,
    y = .$V2,
    z = .$V3,
    type = "scatter3d",
    mode = "markers",
    color = a$rich
  ) %>%
  layout(title = "Experiment 66")